Load libraries and setup

Introduction

This file contains analyses that identify the overall correlation between GPT-4’s ratings and human ratings for different psycholinguistic norms.

Dataset 1: Iconicity

# ---- Dataset 1: load GPT-4 and human iconicity ratings, then merge ----
# GPT-4 ratings: one row per word, with the raw model response string and
# a parsed numeric rating.
df_gpt = read_csv("../../data/processed/iconicity/iconicity_gpt-4.csv")
## Rows: 14772 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, gpt-4_response
## dbl (1): gpt4_rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 14772
# Human iconicity norms: mean rating, rating SD, and sample sizes per word.
df_human = read_csv("../../data/raw/iconicity/iconicity.csv")
## Rows: 14776 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (5): n_ratings, n, prop_known, rating, rating_sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 14776
# Keep only words rated by both sources; the join key is inferred as
# "word" (see message below). NOTE(review): an explicit `by = "word"`
# would be safer if either file ever gains another shared column name.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = "word"
# 4 human-normed words lack a GPT-4 rating (14776 -> 14772).
nrow(df_merged)
## [1] 14772
### How correlated?
# Pearson correlation between human mean iconicity ratings and GPT-4 ratings.
cor.test(df_merged$rating, df_merged$gpt4_rating)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged$rating and df_merged$gpt4_rating
## t = 98.953, df = 14770, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6215975 0.6409944
## sample estimates:
##       cor 
## 0.6313947
# Rank-based (Spearman) version; ties in the discrete GPT-4 ratings make
# an exact p-value unavailable, hence the warning below.
cor.test(df_merged$rating, df_merged$gpt4_rating, method = "spearman")
## Warning in cor.test.default(df_merged$rating, df_merged$gpt4_rating, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged$rating and df_merged$gpt4_rating
## S = 2.1867e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5929681
# Reference level for human-human agreement on iconicity ratings.
# NOTE(review): presumably a reliability estimate from the norming
# study — confirm against the source paper.
HUMAN_AGREEMENT_ICONICITY = 0.35

# Spearman correlation between human and GPT-4 ratings, labeled for plotting.
df_corr = df_merged %>%
  summarise(r = cor(rating, gpt4_rating, method = "spearman")) %>%
  mutate(dimension = "Iconicity")

# Bar of the GPT-4/human correlation against the human-agreement baseline
# (dashed line). geom_col() is the idiomatic equivalent of
# geom_bar(stat = "identity").
df_corr %>%
  ggplot(aes(x = dimension, y = r)) +
  geom_col(alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  geom_hline(yintercept = HUMAN_AGREEMENT_ICONICITY, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

# Distribution of GPT-4 iconicity ratings.
ggplot(df_merged, aes(x = gpt4_rating)) +
  geom_histogram(alpha = .5, bins = 7)

# Distribution of human iconicity ratings, for comparison.
ggplot(df_merged, aes(x = rating)) +
  geom_histogram(alpha = .5, bins = 7)

# Human vs. GPT-4 ratings with a linear fit.
ggplot(df_merged, aes(x = gpt4_rating, y = rating)) +
  geom_point(alpha = .6) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Iconicity Judgment",
       y = "Human Iconicity Judgment")
## `geom_smooth()` using formula 'y ~ x'

Error Analysis

# Per-item error measures:
#   diff  — signed error (GPT-4 minus human mean)
#   z     — signed error scaled by the human rating SD for that word
#   abs_* — magnitudes, for ranking items by disagreement
# mutate() evaluates columns sequentially, so `diff` can be reused for
# `z` instead of recomputing the subtraction.
df_merged = df_merged %>%
  mutate(diff = gpt4_rating - rating,
         z = diff / rating_sd,
         abs_diff = abs(diff),
         abs_z = abs(z))

# Distribution of absolute SD-scaled errors.
df_merged %>%
  ggplot(aes(x = abs_z)) +
  geom_histogram(alpha = .5, bins = 7)

Qualitative

# Top-20 words by |z| (error scaled by the human rating SD).
dftop20 = df_merged %>%
  arrange(desc(abs_z)) %>%
  head(20)

# Human ratings among the top errors span both ends of the scale.
summary(dftop20$rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.300   1.955   5.900   4.609   6.400   6.923
# GPT-4 ratings for the same items cluster near the scale midpoint.
summary(dftop20$gpt4_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    4.00    4.05    5.00    6.00
dftop20
## # A tibble: 20 × 12
##    word     n_ratings     n prop_k…¹ rating ratin…² gpt-4…³ gpt4_…⁴   diff     z
##    <chr>        <dbl> <dbl>    <dbl>  <dbl>   <dbl> <chr>     <dbl>  <dbl> <dbl>
##  1 oomph           13    13    1       6.92   0.277 5             5 -1.92  -6.93
##  2 hi              10    10    1       6.2    0.789 3             3 -3.2   -4.06
##  3 legit           10    10    1       1.6    0.699 4             4  2.4    3.43
##  4 lead            10    10    1       1.7    0.675 4             4  2.3    3.41
##  5 trio            10    10    1       6.3    0.675 4             4 -2.3   -3.41
##  6 gosh            10    10    1       6      0.943 3             3 -3     -3.18
##  7 chopper         10    10    1       6.6    0.516 5             5 -1.6   -3.10
##  8 powered         10    10    1       5.8    0.919 3             3 -2.8   -3.05
##  9 swish           11    11    1       6.91   0.302 6             6 -0.909 -3.02
## 10 shape           12    12    1       1.42   0.900 4             4  2.58   2.87
## 11 wiggle          10    10    1       6.9    0.316 6             6 -0.9   -2.85
## 12 popper          10    10    1       6.4    0.843 4             4 -2.4   -2.85
## 13 direful         10    12    0.833   2      1.05  5             5  3      2.85
## 14 swash           10    11    0.909   6.3    0.823 4             4 -2.3   -2.79
## 15 wring           10    11    0.909   6.4    0.516 5             5 -1.4   -2.71
## 16 heedless        11    11    1       2.27   0.647 4             4  1.73   2.67
## 17 taxation        11    11    1       4.64   1.03  2             2 -2.64  -2.57
## 18 dictate         10    10    1       4.7    0.675 3             3 -1.7   -2.52
## 19 partial         10    10    1       1.3    0.675 3             3  1.7    2.52
## 20 below           11    11    1       1.82   0.874 4             4  2.18   2.50
## # … with 2 more variables: abs_diff <dbl>, abs_z <dbl>, and abbreviated
## #   variable names ¹​prop_known, ²​rating_sd, ³​`gpt-4_response`, ⁴​gpt4_rating
## # ℹ Use `colnames()` to see all variable names

Dataset 2: SimLex

# ---- Dataset 2: SimLex-999 word-pair similarity ----
df_gpt = read_csv("../../data/processed/simlex/simlex_gpt-4.csv")
## Rows: 999 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): word1, word2, POS, gpt-4_response
## dbl (8): SimLex999, conc(w1), conc(w2), concQ, Assoc(USF), SimAssoc333, SD(S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 999
df_human = read_csv("../../data/raw/simlex/simlex.csv")
## Rows: 999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): word1, word2, POS
## dbl (7): SimLex999, conc(w1), conc(w2), concQ, Assoc(USF), SimAssoc333, SD(S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 999
# The GPT-4 file already carries every human column, so the natural join
# matches on all ten shared columns (see message below).
# NOTE(review): joining on double-valued columns (SimLex999, SD(SimLex))
# only works because the values were copied verbatim; an explicit
# `by = c("word1", "word2", "POS")` would be more robust.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = c("word1", "word2", "POS", "SimLex999", "conc(w1)", "conc(w2)",
## "concQ", "Assoc(USF)", "SimAssoc333", "SD(SimLex)")
nrow(df_merged)
## [1] 999
### How correlated?
# Pearson correlation between SimLex-999 human similarity and GPT-4 ratings.
cor.test(df_merged$SimLex999, df_merged$gpt4_rating)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged$SimLex999 and df_merged$gpt4_rating
## t = 53.444, df = 997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8440121 0.8761999
## sample estimates:
##       cor 
## 0.8609654
# Spearman (rank) correlation; ties prevent an exact p-value (see warning).
cor.test(df_merged$SimLex999, df_merged$gpt4_rating, method = "spearman")
## Warning in cor.test.default(df_merged$SimLex999, df_merged$gpt4_rating, : Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged$SimLex999 and df_merged$gpt4_rating
## S = 22589372, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8640562
# Reference level for human-human agreement on SimLex-999.
# NOTE(review): presumably inter-annotator agreement from the original
# paper — confirm against the source.
HUMAN_AGREEMENT_SIMLEX = 0.78

# Spearman correlation between human and GPT-4 ratings, labeled for plotting.
df_corr = df_merged %>%
  summarise(r = cor(SimLex999, gpt4_rating, method = "spearman")) %>%
  mutate(dimension = "Similarity (SimLex999)")

# Bar of the GPT-4/human correlation against the human baseline (dashed).
# Fixes: removed a stray empty argument (", ,") from the original
# geom_hline() call, and used geom_col(), the idiomatic equivalent of
# geom_bar(stat = "identity").
df_corr %>%
  ggplot(aes(x = dimension, y = r)) +
  geom_col(alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  geom_hline(yintercept = HUMAN_AGREEMENT_SIMLEX, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

# Human vs. GPT-4 similarity with a linear fit.
ggplot(df_merged, aes(x = gpt4_rating, y = SimLex999)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Similarity Judgment",
       y = "Human Similarity Judgment")
## `geom_smooth()` using formula 'y ~ x'

# Same scatter, faceted by the SimAssoc333 flag (whether the pair is in
# the highly-associated subset).
ggplot(df_merged, aes(x = gpt4_rating, y = SimLex999)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Similarity Judgment",
       y = "Human Similarity Judgment") +
  facet_wrap(~SimAssoc333)
## `geom_smooth()` using formula 'y ~ x'

Error Analysis

# Signed (GPT-4 minus human) and absolute errors per word pair.
df_merged = df_merged %>%
  mutate(diff = gpt4_rating - SimLex999,
         abs_diff = abs(diff))

# Distribution of signed errors.
df_merged %>%
  ggplot(aes(x = diff)) +
  geom_histogram(alpha = .5, bins = 7)

# Absolute error by association status (SimAssoc333 flag).
# Point = mean; range = mean +/- 2 standard errors.
df_merged %>%
  ggplot(aes(x = SimAssoc333, y = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary (fun = function(x){mean(x)},
                fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
                fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
                geom= 'pointrange', 
                position=position_dodge(width=0.95)) +
  theme_minimal() +
  labs(x = "Associated (0 = no, 1 = yes)",
       y = "Absolute error")

# Absolute error by part of speech.
df_merged %>%
  ggplot(aes(y = POS, x = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary (fun = function(x){mean(x)},
                fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
                fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
                geom= 'pointrange', 
                position=position_dodge(width=0.95)) +
  theme_minimal() +
  labs(y = "Part-of-Speech",
       x = "Absolute Error")

# Absolute error by concreteness quartile (concQ treated as a factor).
df_merged %>%
  ggplot(aes(y = factor(concQ), x = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary (fun = function(x){mean(x)},
                fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
                fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
                geom= 'pointrange', 
                position=position_dodge(width=0.95)) +
  theme_minimal() +
  labs(y = "Concreteness Quartile",
       x = "Absolute Error")

# Nested model comparison: does POS explain absolute error beyond
# concreteness, and vice versa? m_full includes both predictors.
m_full = lm(data = df_merged, abs_diff ~ concQ + POS)
m_conc = lm(data = df_merged, abs_diff ~ concQ)
m_pos = lm(data = df_merged, abs_diff ~ POS)

# Does adding POS improve on concreteness alone?
anova(m_conc, m_full)
## Analysis of Variance Table
## 
## Model 1: abs_diff ~ concQ
## Model 2: abs_diff ~ concQ + POS
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    997 1005.85                                  
## 2    995  979.99  2    25.861 13.128 2.357e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does adding concreteness (concQ) improve on POS alone?
anova(m_pos, m_full)
## Analysis of Variance Table
## 
## Model 1: abs_diff ~ POS
## Model 2: abs_diff ~ concQ + POS
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    996 995.42                                  
## 2    995 979.99  1    15.428 15.664 8.101e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full-model coefficients: per the output below, error rises with
# concreteness quartile and is largest for verb pairs (POSV).
summary(m_full)
## 
## Call:
## lm(formula = abs_diff ~ concQ + POS, data = df_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3862 -0.7260 -0.2572  0.5135  5.4830 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.55708    0.10676   5.218 2.20e-07 ***
## concQ        0.15017    0.03794   3.958 8.10e-05 ***
## POSN         0.20946    0.12061   1.737   0.0828 .  
## POSV         0.54875    0.11556   4.749 2.35e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9924 on 995 degrees of freedom
## Multiple R-squared:  0.04055,    Adjusted R-squared:  0.03765 
## F-statistic: 14.02 on 3 and 995 DF,  p-value: 5.919e-09

Qualitative

# Top-20 SimLex pairs by absolute GPT-4/human disagreement.
dftop20 = df_merged %>%
  arrange(desc(abs_diff)) %>%
  head(20)
# Human ratings in SimLex live in the `SimLex999` column. The original
# call referenced `rating` (a column from the iconicity dataset), which
# triggered an "Unknown or uninitialised column" warning and summarised
# NULL instead of the human scores.
summary(dftop20$SimLex999)
# GPT-4 assigns high similarity to these top-error pairs (per the table
# below, mostly antonym-like pairs humans rate as dissimilar).
summary(dftop20$gpt4_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    7.00    7.00    7.15    8.00    9.00
dftop20
## # A tibble: 20 × 14
##    word1    word2    POS   SimLe…¹ conc(…² conc(…³ concQ Assoc…⁴ SimAs…⁵ SD(Si…⁶
##    <chr>    <chr>    <chr>   <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
##  1 wife     husband  N        2.3     4.13    4.11     3    8.85       1    0.84
##  2 multiply divide   V        1.75    2.79    2.86     2    2.53       1    1.32
##  3 south    north    N        2.2     3.84    4.14     3    7.72       1    0.59
##  4 sunset   sunrise  N        2.47    4.54    4.69     3    4.8        1    1.12
##  5 dog      cat      N        1.75    4.85    4.86     4    5.13       1    1.3 
##  6 groom    bride    N        3.17    4.54    4.63     3    8.65       1    0.68
##  7 add      divide   V        2.3     3       2.86     2    0.41       0    1.25
##  8 absence  presence N        0.4     2.31    2.72     1    1.38       1    1.5 
##  9 lady     gentlem… N        3.42    4.33    3.57     3    3.29       1    1.08
## 10 go       come     V        2.42    3.15    2.72     2    5.75       1    1.51
## 11 dad      mother   N        3.55    4.29    4.6      3    0.31       0    1.44
## 12 spend    save     V        0.55    2.93    2.42     2    0.61       1    1.06
## 13 north    west     N        3.63    4.14    3.44     3    0.31       0    1.6 
## 14 bottom   top      N        0.7     4.25    3.93     3    6.96       1    1.16
## 15 liver    lung     N        2.7     4.68    4.82     4    0.14       0    1.38
## 16 multiply add      V        2.7     2.79    3        2    0.5        1    1.8 
## 17 rice     bean     N        2.72    4.86    5        4    0.34       0    1.27
## 18 bee      ant      N        2.78    4.88    4.86     4    0.34       0    1.04
## 19 leg      arm      N        2.88    4.83    4.96     4    6.73       1    0.79
## 20 cow      goat     N        2.93    4.96    5        4    0.42       0    0.59
## # … with 4 more variables: `gpt-4_response` <chr>, gpt4_rating <dbl>,
## #   diff <dbl>, abs_diff <dbl>, and abbreviated variable names ¹​SimLex999,
## #   ²​`conc(w1)`, ³​`conc(w2)`, ⁴​`Assoc(USF)`, ⁵​SimAssoc333, ⁶​`SD(SimLex)`
## # ℹ Use `colnames()` to see all variable names

Dataset 3: SimVerb

# ---- Dataset 3: SimVerb-3500 verb-pair similarity ----
# NOTE(review): readr reports parsing issues below — inspect
# problems(df_gpt) to verify no gpt4_rating values were mangled
# (at least one NA response appears later in the top-error table).
df_gpt = read_csv("../../data/processed/simverb/simverb_gpt-4.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 3500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word1, word2
## dbl (2): gpt-4_response, gpt4_rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 3500
df_human = read_csv("../../data/raw/simverb/simverb.csv")
## Rows: 3500 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): word1, word2, POS, Relation
## dbl (1): Similarity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 3500
# Natural join on the shared keys (word1, word2); all 3500 pairs match.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = c("word1", "word2")
nrow(df_merged)
## [1] 3500
### How correlated?
# Pearson correlation between SimVerb human similarity and GPT-4 ratings.
cor.test(df_merged$Similarity, df_merged$gpt4_rating)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged$Similarity and df_merged$gpt4_rating
## t = 78.991, df = 3498, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7882583 0.8120784
## sample estimates:
##       cor 
## 0.8004842
# Spearman (rank) correlation; ties prevent an exact p-value (see warning).
cor.test(df_merged$Similarity, df_merged$gpt4_rating, method = "spearman")
## Warning in cor.test.default(df_merged$Similarity, df_merged$gpt4_rating, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged$Similarity and df_merged$gpt4_rating
## S = 1381649701, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8066496
# Reference level for human-human agreement on SimVerb-3500.
# NOTE(review): presumably inter-annotator agreement from the original
# paper — confirm against the source.
HUMAN_AGREEMENT_SIMVERB = 0.86

# Spearman correlation between human and GPT-4 ratings, labeled for plotting.
df_corr = df_merged %>%
  summarise(r = cor(Similarity, gpt4_rating, method = "spearman")) %>%
  mutate(Dimension = "Similarity (SimVerb3500)")

# Bar of the GPT-4/human correlation against the human baseline (dashed).
# Fixes: removed a stray empty argument (", ,") from the original
# geom_hline() call, and used geom_col(), the idiomatic equivalent of
# geom_bar(stat = "identity").
df_corr %>%
  ggplot(aes(x = Dimension, y = r)) +
  geom_col(alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  geom_hline(yintercept = HUMAN_AGREEMENT_SIMVERB, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

# Human vs. GPT-4 similarity with a linear fit.
ggplot(df_merged, aes(x = gpt4_rating, y = Similarity)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Similarity Judgment",
       y = "Human Similarity Judgment")
## `geom_smooth()` using formula 'y ~ x'

# Same scatter split by semantic relation type.
ggplot(df_merged, aes(x = gpt4_rating, y = Similarity)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Similarity Judgment",
       y = "Human Similarity Judgment") +
  facet_wrap(~Relation)
## `geom_smooth()` using formula 'y ~ x'

Error Analysis

# Signed (GPT-4 minus human) and absolute errors per verb pair.
df_merged = df_merged %>%
  mutate(diff = gpt4_rating - Similarity) %>%
  mutate(abs_diff = abs(diff))

# Distribution of signed errors.
ggplot(df_merged, aes(x = diff)) +
  geom_histogram(alpha = .5, bins = 7)

# Absolute error by relation type, ordered by mean error.
# Point = mean; range = mean +/- 2 standard errors.
df_merged %>%
  ggplot(aes(y = reorder(Relation, abs_diff), x = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary (fun = function(x){mean(x)},
                fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
                fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
                geom= 'pointrange', 
                position=position_dodge(width=0.95)) +
  theme_minimal() +
  labs(y = "Relation Type",
       x = "Absolute Error")

# Mean signed error by relation: per the table below, GPT-4 overrates
# antonym pairs most (high model similarity where humans rate low).
df_merged %>%
  group_by(Relation) %>%
  summarise(m_diff = mean(diff),
            sd_diff = sd(diff))
## # A tibble: 5 × 3
##   Relation       m_diff sd_diff
##   <chr>           <dbl>   <dbl>
## 1 ANTONYMS        2.53     1.64
## 2 COHYPONYMS      1.48     2.00
## 3 HYPER/HYPONYMS  0.766    1.47
## 4 NONE            1.19     1.54
## 5 SYNONYMS        0.720    1.60
# Does relation type predict absolute error? Compare against the
# intercept-only model.
m = lm(data = df_merged, abs_diff ~ Relation)
m_reduced = lm(data = df_merged, abs_diff ~ 1)
anova(m_reduced, m)
## Analysis of Variance Table
## 
## Model 1: abs_diff ~ 1
## Model 2: abs_diff ~ Relation
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   3499 5149.4                                  
## 2   3495 4924.8  4    224.63 39.853 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficients by relation type: ANTONYMS is the reference level
# (intercept) and, per the output below, shows the largest error.
summary(m)
## 
## Call:
## lm(formula = abs_diff ~ Relation, data = df_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4220 -0.8890 -0.2409  0.6210  6.1210 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.5920     0.1127  23.005  < 2e-16 ***
## RelationCOHYPONYMS      -0.6565     0.1418  -4.629 3.81e-06 ***
## RelationHYPER/HYPONYMS  -1.3211     0.1202 -10.988  < 2e-16 ***
## RelationNONE            -1.0430     0.1156  -9.021  < 2e-16 ***
## RelationSYNONYMS        -1.2767     0.1315  -9.711  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.187 on 3495 degrees of freedom
## Multiple R-squared:  0.04362,    Adjusted R-squared:  0.04253 
## F-statistic: 39.85 on 4 and 3495 DF,  p-value: < 2.2e-16

Qualitative

# Top-20 SimVerb pairs by absolute GPT-4/human disagreement.
dftop20 = df_merged %>%
  arrange(desc(abs_diff)) %>%
  head(20)
# Human ratings in SimVerb live in the `Similarity` column. The original
# call referenced `rating` (a column from the iconicity dataset), which
# triggered an "Unknown or uninitialised column" warning and summarised
# NULL instead of the human scores.
summary(dftop20$Similarity)
# GPT-4 assigns mostly high similarity to these top-error pairs (the 0
# minimum is the NA-response item seen in the table below).
summary(dftop20$gpt4_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0     7.5     7.2     8.0     9.0
dftop20
## # A tibble: 20 × 9
##    word1     word2    POS   Similarity Relation    gpt-4…¹ gpt4_…²  diff abs_d…³
##    <chr>     <chr>    <chr>      <dbl> <chr>         <dbl>   <dbl> <dbl>   <dbl>
##  1 object    disagree V           7.85 COHYPONYMS       NA       0 -7.85    7.85
##  2 incline   decline  V           0.33 NONE              8       8  7.67    7.67
##  3 win       defeat   V           1.49 NONE              9       9  7.51    7.51
##  4 jerk      prick    V           0.5  NONE              8       8  7.5     7.5 
##  5 multiply  divide   V           0.5  ANTONYMS          8       8  7.5     7.5 
##  6 disallow  allow    V           0.5  ANTONYMS          8       8  7.5     7.5 
##  7 subtract  multiply V           0.83 COHYPONYMS        8       8  7.17    7.17
##  8 buy       sell     V           0    ANTONYMS          7       7  7       7   
##  9 exhale    inhale   V           1    ANTONYMS          8       8  7       7   
## 10 divide    add      V           0    COHYPONYMS        7       7  7       7   
## 11 ask       tell     V           0.66 HYPER/HYPO…       7       7  6.34    6.34
## 12 kick      punch    V           1.66 COHYPONYMS        8       8  6.34    6.34
## 13 reap      sow      V           0.66 NONE              7       7  6.34    6.34
## 14 disappear reappear V           0.66 NONE              7       7  6.34    6.34
## 15 push      tug      V           0.66 SYNONYMS          7       7  6.34    6.34
## 16 die       kill     V           0.83 NONE              7       7  6.17    6.17
## 17 need      want     V           1.99 SYNONYMS          8       8  6.01    6.01
## 18 spring    fall     V           1    NONE              7       7  6       6   
## 19 sell      purchase V           2.16 ANTONYMS          8       8  5.84    5.84
## 20 please    beg      V           1.16 NONE              7       7  5.84    5.84
## # … with abbreviated variable names ¹​`gpt-4_response`, ²​gpt4_rating, ³​abs_diff

Dataset 4: RAW-C

# ---- Dataset 4: RAW-C contextual relatedness ----
# The processed GPT-4 file already includes the human norms
# (mean_relatedness etc.), so no separate merge is needed here.
df_gpt = read_csv("../../data/processed/raw-c/raw-c_gpt-4.csv")
## Rows: 672 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): word, sentence1, sentence2, ambiguity_type, disambiguating_word1, ...
## dbl  (9): mean_relatedness, median_relatedness, diff, count, sd_relatedness,...
## lgl  (1): same
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 672
### How correlated?
# Pearson correlation between human mean relatedness and GPT-4 ratings.
cor.test(df_gpt$mean_relatedness, df_gpt$gpt4_rating)
## 
##  Pearson's product-moment correlation
## 
## data:  df_gpt$mean_relatedness and df_gpt$gpt4_rating
## t = 35.17, df = 670, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7770950 0.8304386
## sample estimates:
##       cor 
## 0.8053915
# Spearman (rank) correlation; ties prevent an exact p-value (see warning).
cor.test(df_gpt$mean_relatedness, df_gpt$gpt4_rating, method = "spearman")
## Warning in cor.test.default(df_gpt$mean_relatedness, df_gpt$gpt4_rating, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_gpt$mean_relatedness and df_gpt$gpt4_rating
## S = 9333386, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8154629
# Reference level for human-human agreement on RAW-C relatedness.
# NOTE(review): presumably inter-annotator agreement from the original
# paper — confirm against the source.
HUMAN_AGREEMENT_RAWC = 0.79

# Spearman correlation between human and GPT-4 ratings, labeled for plotting.
df_corr = df_gpt %>%
  summarise(r = cor(mean_relatedness, gpt4_rating, method = "spearman")) %>%
  mutate(Dimension = "Relatedness (RAW-C)")

# Bar of the GPT-4/human correlation against the human baseline (dashed).
# Fixes: removed a stray empty argument (", ,") from the original
# geom_hline() call, and used geom_col(), the idiomatic equivalent of
# geom_bar(stat = "identity").
df_corr %>%
  ggplot(aes(x = Dimension, y = r)) +
  geom_col(alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  geom_hline(yintercept = HUMAN_AGREEMENT_RAWC, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

# Human vs. GPT-4 relatedness with a linear fit.
ggplot(df_gpt, aes(x = gpt4_rating, y = mean_relatedness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "GPT-4 Relatedness Judgment",
       y = "Human Relatedness Judgment")
## `geom_smooth()` using formula 'y ~ x'

Error Analysis

# Signed (GPT-4 minus human) and absolute errors per sentence pair.
df_gpt = df_gpt %>%
  mutate(diff = gpt4_rating - mean_relatedness) %>%
  mutate(abs_diff = abs(diff))

# Distribution of absolute errors.
ggplot(df_gpt, aes(x = abs_diff)) +
  geom_histogram(alpha = .5, bins = 7)

# Absolute error by whether the two sentences use the same sense.
# Point = mean; range = mean +/- 2 standard errors.
df_gpt %>%
  ggplot(aes(y = reorder(same, abs_diff), x = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary (fun = function(x){mean(x)},
                fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
                fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
                geom= 'pointrange', 
                position=position_dodge(width=0.95)) +
  theme_minimal() +
  labs(y = "Same Sense",
       x = "Absolute Error")

# Ridgeline densities of absolute error by ambiguity type, split by
# same/different sense (geom_density_ridges2() is from ggridges).
# Fix: theme_minimal() must come BEFORE the theme() tweaks — a complete
# theme replaces all theme elements, so the original ordering (complete
# theme added last) silently discarded the legend placement and the
# text-size settings. The two theme() calls are merged; theme() calls
# compose additively, so this is equivalent.
df_gpt %>%
  ggplot(aes(x = abs_diff,
             y = reorder(ambiguity_type, abs_diff),
             fill = same)) +
  geom_density_ridges2(aes(height = ..density..),
                       color=gray(0.25),
                       alpha = 0.5,
                       scale=0.85,
                       size=.9,
                       stat="density") +
  labs(x = "Absolute error",
       y = "Ambiguity Type",
       fill = "Same vs. Different Sense") +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.title = element_text(size=rel(1.5)),
        axis.text = element_text(size = rel(1.5)),
        legend.text = element_text(size = rel(1.5)),
        legend.title = element_text(size = rel(1.5)),
        strip.text.x = element_text(size = rel(1.5)))

# Per the output below, same-sense pairs show ~0.53 lower absolute error
# than different-sense pairs.
summary(lm(data = df_gpt, abs_diff ~ same))
## 
## Call:
## lm(formula = abs_diff ~ same, data = df_gpt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02447 -0.39947 -0.02447  0.39220  1.88463 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.02447    0.02447   41.86   <2e-16 ***
## sameTRUE    -0.53430    0.04239  -12.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.518 on 670 degrees of freedom
## Multiple R-squared:  0.1917, Adjusted R-squared:  0.1905 
## F-statistic: 158.9 on 1 and 670 DF,  p-value: < 2.2e-16
### Get residuals
# NOTE(review): despite the header, residuals are never extracted here —
# this fits human relatedness from the GPT-4 rating plus same-sense
# status, to test whether sense structure adds explanatory power.
mod = lm(data = df_gpt, mean_relatedness ~ gpt4_rating + same)
summary(mod)
## 
## Call:
## lm(formula = mean_relatedness ~ gpt4_rating + same, data = df_gpt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05139 -0.57085 -0.08017  0.53396  2.37686 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.70089    0.11365  -6.167  1.2e-09 ***
## gpt4_rating  0.94773    0.05038  18.812  < 2e-16 ***
## sameTRUE     0.86544    0.09322   9.283  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7803 on 669 degrees of freedom
## Multiple R-squared:  0.6888, Adjusted R-squared:  0.6878 
## F-statistic: 740.2 on 2 and 669 DF,  p-value: < 2.2e-16
### Does GPT-4 predict human relatedness for same sense judgments?
# Restricting to same-sense pairs: per the output below, the slope stays
# positive and significant, but the fit is much weaker (R^2 ~ .13).
mod = lm(data = filter(df_gpt, same == TRUE), mean_relatedness ~ gpt4_rating)
summary(mod)
## 
## Call:
## lm(formula = mean_relatedness ~ gpt4_rating, data = filter(df_gpt, 
##     same == TRUE))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9334 -0.2815  0.1442  0.3553  0.7029 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.25408    0.21132  10.667  < 2e-16 ***
## gpt4_rating  0.34766    0.05992   5.802 2.24e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5011 on 222 degrees of freedom
## Multiple R-squared:  0.1317, Adjusted R-squared:  0.1278 
## F-statistic: 33.66 on 1 and 222 DF,  p-value: 2.241e-08

Qualitative

# Top-20 RAW-C sentence pairs by absolute GPT-4/human disagreement.
dftop20 = df_gpt %>%
  arrange(desc(abs_diff)) %>%
  head(20)
# Human ratings in RAW-C live in the `mean_relatedness` column. The
# original call referenced `rating` (a column from the iconicity
# dataset), which triggered an "Unknown or uninitialised column" warning
# and summarised NULL instead of the human scores.
summary(dftop20$mean_relatedness)
# GPT-4 ratings are low-to-mid for these items, while (per the table
# below) human relatedness is near zero — GPT-4 overestimates
# relatedness for clearly different senses.
summary(dftop20$gpt4_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     2.0     2.5     2.5     3.0     3.0
# Show the sentence pairs behind the largest errors.
dftop20 %>%
  select(word, sentence1, sentence2, mean_relatedness, gpt4_rating)
## # A tibble: 20 × 5
##    word    sentence1                    sentence2                mean_…¹ gpt4_…²
##    <chr>   <chr>                        <chr>                      <dbl>   <dbl>
##  1 cape    It was a red cape.           It was a rocky cape.      0.0909       3
##  2 punch   He had the alcoholic punch.  He had the strongest pu…  0.429        3
##  3 toast   They toasted the strudel.    They toasted the host.    0.462        3
##  4 perch   It was a tasty perch.        It was a secure perch.    0.6          3
##  5 cross   He crossed a room.           He crossed a friend.      0.643        3
##  6 cross   He crossed a road.           He crossed an enemy.      0.727        3
##  7 panel   It was a control panel.      It was an advisory pane…  0.8          3
##  8 bat     He saw a furry bat.          He saw a wooden bat.      0            2
##  9 call    They called the police.      They called the debt.     1            3
## 10 cape    It was a flowing cape.       It was a rocky cape.      1            3
## 11 fan     They had an electric fan.    They had an enthusiasti…  0            2
## 12 orange  It was a juicy orange.       It was a reddish orange.  1            3
## 13 board   It was the ironing board.    It was the executive bo…  0.0667       2
## 14 degree  It was about forty degrees.  It was about associate …  0.0714       2
## 15 fan     They had a ceiling fan.      They had an enthusiasti…  0.0714       2
## 16 pitcher He saw the fast pitcher.     He saw the glass pitche…  0.0714       2
## 17 pupil   She had an interested pupil. She had a dilated pupil.  0.0714       2
## 18 degree  It was about fifty degrees.  It was about associate …  0.0769       2
## 19 file    She had a data file.         She had a nail file.      0.0833       2
## 20 band    It was a rubber band.        It was a country band.    0.1          2
## # … with abbreviated variable names ¹​mean_relatedness, ²​gpt4_rating

Dataset 5a: Contextualized Perception Norms

# ---- Dataset 5a: contextualized sensory-strength (perception) norms ----
# GPT-4 ratings: one column per sensory dimension, per (word, sentence).
df_gpt = read_csv("../../data/processed/cs_norms_perception/cs_norms_perception_gpt-4.csv")
## Rows: 448 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentence
## dbl (6): Hearing, Interoception, Olfaction, Taste, Touch, Vision
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 448
# Human norms: mean (*.M) and SD (*.SD) per dimension, per (word, sentence).
df_human = read_csv("../../data/raw/cs_norms_perception/cs_norms_perception.csv")
## Rows: 448 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): word, sentence, context
## dbl (13): Vision.M, Vision.SD, Hearing.M, Hearing.SD, Olfaction.M, Olfaction...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 448
# Natural join on (word, sentence); all 448 items match.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = c("word", "sentence")
nrow(df_merged)
## [1] 448
### How correlated?

# Per-dimension Spearman correlations between human means (*.M) and
# GPT-4's contextualized sensory ratings.
df_summ = df_merged %>%
  summarise(Vision = cor(Vision.M, Vision, method = "spearman"),
            Hearing = cor(Hearing.M, Hearing, method = "spearman"),
            Touch = cor(Touch.M, Touch, method = "spearman"),
            Olfaction = cor(Olfaction.M, Olfaction, method = "spearman"),
            Taste = cor(Taste.M, Taste, method = "spearman"),
            Interoception = cor(Interoception.M, Interoception, method = "spearman"))

df_summ
## # A tibble: 1 × 6
##   Vision Hearing Touch Olfaction Taste Interoception
##    <dbl>   <dbl> <dbl>     <dbl> <dbl>         <dbl>
## 1  0.663   0.662 0.750     0.708 0.626         0.545
### Reshape to one row per dimension for plotting.
df_long = pivot_longer(df_summ, everything(),
                       names_to = "Factor", values_to = "Correlation")

### Bar chart of per-dimension correlations, ordered from lowest to highest.
### geom_col() is equivalent to geom_bar(stat = "identity").
ggplot(df_long, aes(x = reorder(Factor, Correlation), y = Correlation)) +
  geom_col() +
  labs(x = "", y = "Correlation") +
  scale_y_continuous(limits = c(0, 1)) +
  theme_minimal()

### calculate overall correlation
### Pivot human means to long format: one row per (word, sentence, dimension).
df_human_long = df_human %>%
  select(word, sentence, Vision.M, Hearing.M, Touch.M,
         Olfaction.M, Taste.M, Interoception.M) %>%
  pivot_longer(cols = c(Vision.M, Hearing.M, Touch.M,
                        Olfaction.M, Taste.M, Interoception.M),
               names_to = "Dimension",
               values_to = "Strength_human") %>%
  # Strip the ".M" suffix so Dimension matches the GPT column names.
  mutate(Dimension = str_remove(Dimension, "\\.M$"))

### Same long format for the GPT-4 ratings.
df_gpt_long = df_gpt %>%
  select(word, sentence, Vision, Hearing, Touch, Olfaction, Taste, Interoception) %>%
  pivot_longer(cols = c(Vision, Hearing, Touch, Olfaction, Taste, Interoception),
               names_to = "Dimension",
               values_to = "Strength_GPT")

### FIX: dplyr joins take `by`, not `on`; the original `on = c(word, sentence)`
### was silently swallowed by `...` and the join fell back to the natural keys
### (word, sentence, Dimension). Spell those keys out explicitly.
### (Also removed an accidentally duplicated comment line.)
df_merged_long = df_human_long %>%
  inner_join(df_gpt_long, by = c("word", "sentence", "Dimension"))
## Joining, by = c("word", "sentence", "Dimension")
### Overall Pearson correlation across all word-sentence-dimension pairs.
cor.test(df_merged_long$Strength_GPT, df_merged_long$Strength_human)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged_long$Strength_GPT and df_merged_long$Strength_human
## t = 81.152, df = 2686, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8314800 0.8534075
## sample estimates:
##       cor 
## 0.8427931
### Spearman rank correlation; ties prevent an exact p-value (hence warning).
cor.test(df_merged_long$Strength_GPT, df_merged_long$Strength_human, method = "spearman")
## Warning in cor.test.default(df_merged_long$Strength_GPT,
## df_merged_long$Strength_human, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged_long$Strength_GPT and df_merged_long$Strength_human
## S = 528885361, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8366102
### Scatter of GPT vs. human strength ratings with a linear fit,
### faceted by sensory dimension.
ggplot(df_merged_long, aes(x = Strength_GPT, y = Strength_human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~Dimension) +
  labs(x = "Strength (GPT rating)",
       y = "Strength (Human rating)") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

### Human inter-annotator agreement benchmark for the contextualized
### perception norms (presumably from the original norming study — TODO
### confirm provenance).
HUMAN_AGREEMENT_CSP = 0.64

### Overall Spearman correlation, as a single bar against the human benchmark.
df_corr = data.frame(r = cor(df_merged_long$Strength_human, 
                             df_merged_long$Strength_GPT, method = "spearman"),
                     dimension = "Perception Norms")


df_corr %>%
  ggplot(aes(x = dimension, y = r)) +
  geom_bar(stat = "identity", alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  ### FIX: removed stray empty argument (double comma) in geom_hline().
  geom_hline(yintercept = HUMAN_AGREEMENT_CSP, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

Dataset 5b: Contextualized Action Norms

### Load GPT-4's contextualized action ratings (one row per
### word-in-sentence pair; one column per body-part effector).
df_gpt = read_csv("../../data/processed/cs_norms_action/cs_norms_action_gpt-4.csv")
## Rows: 448 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentence
## dbl (5): Foot_leg, Hand_arm, Head, Mouth_throat, Torso
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 448
### Load the human norms; each effector has a mean (.M) and SD (.SD) column.
df_human = read_csv("../../data/raw/cs_norms_action/cs_norms_action.csv")
## Rows: 448 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): word, sentence, context
## dbl (11): Foot_leg.M, Foot_leg.SD, Mouth_throat.M, Mouth_throat.SD, Torso.M,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 448
### Natural join on the shared columns (word, sentence); all 448 items match.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = c("word", "sentence")
nrow(df_merged)
## [1] 448
### How correlated?

### Spearman correlation per effector: human mean (.M) vs. GPT-4 rating.
### with() lets each column pair be referenced directly from df_merged.
df_summ = with(df_merged, tibble(
  Hand_arm = cor(Hand_arm.M, Hand_arm, method = "spearman"),
  Foot_leg = cor(Foot_leg.M, Foot_leg, method = "spearman"),
  Head = cor(Head.M, Head, method = "spearman"),
  Torso = cor(Torso.M, Torso, method = "spearman"),
  Mouth_throat = cor(Mouth_throat.M, Mouth_throat, method = "spearman")
))

df_summ
## # A tibble: 1 × 5
##   Hand_arm Foot_leg  Head Torso Mouth_throat
##      <dbl>    <dbl> <dbl> <dbl>        <dbl>
## 1    0.640    0.564 0.447 0.584        0.558
### Reshape to one row per effector for plotting.
df_long = pivot_longer(df_summ, everything(),
                       names_to = "Factor", values_to = "Correlation")

### Bar chart of per-effector correlations, ordered from lowest to highest.
ggplot(df_long, aes(x = reorder(Factor, Correlation), y = Correlation)) +
  geom_col() +
  labs(x = "Factor", y = "Correlation") +
  theme_minimal()

### calculate overall correlation
### Pivot human means to long format: one row per (word, sentence, effector).
df_human_long = df_human %>%
  select(word, sentence, Mouth_throat.M,
         Foot_leg.M, Hand_arm.M, Torso.M, Head.M) %>%
  pivot_longer(cols = c(Mouth_throat.M, Foot_leg.M, Hand_arm.M, Torso.M, Head.M),
               names_to = "Dimension",
               values_to = "Strength_human") %>%
  # Strip the ".M" suffix so Dimension matches the GPT column names.
  mutate(Dimension = str_remove(Dimension, "\\.M$"))

### Same long format for the GPT-4 ratings.
df_gpt_long = df_gpt %>%
  select(word, sentence, Mouth_throat, Foot_leg, Hand_arm, Torso, Head) %>%
  pivot_longer(cols = c(Mouth_throat, Foot_leg, Hand_arm, Torso, Head),
               names_to = "Dimension",
               values_to = "Strength_GPT")

### FIX: dplyr joins take `by`, not `on`; the original `on = c(word, sentence)`
### was silently swallowed by `...` and the join fell back to the natural keys
### (word, sentence, Dimension). Spell those keys out explicitly.
df_merged_long = df_human_long %>%
  inner_join(df_gpt_long, by = c("word", "sentence", "Dimension"))
## Joining, by = c("word", "sentence", "Dimension")
### Overall Pearson correlation across all word-sentence-effector pairs.
cor.test(df_merged_long$Strength_GPT, df_merged_long$Strength_human)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged_long$Strength_GPT and df_merged_long$Strength_human
## t = 45.784, df = 2238, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6734235 0.7162299
## sample estimates:
##       cor 
## 0.6954431
### Spearman rank correlation; ties prevent an exact p-value (hence warning).
cor.test(df_merged_long$Strength_GPT, df_merged_long$Strength_human, method = "spearman")
## Warning in cor.test.default(df_merged_long$Strength_GPT,
## df_merged_long$Strength_human, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged_long$Strength_GPT and df_merged_long$Strength_human
## S = 679132929, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.6374549
### Scatter of GPT vs. human strength ratings with a linear fit,
### faceted by effector.
ggplot(df_merged_long, aes(x = Strength_GPT, y = Strength_human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~Dimension) +
  labs(x = "Strength (GPT rating)",
       y = "Strength (Human rating)") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

### Human inter-annotator agreement benchmark for the contextualized
### action norms (presumably from the original norming study — TODO
### confirm provenance).
HUMAN_AGREEMENT_ACTION = 0.5

### Overall Spearman correlation, as a single bar against the human benchmark.
df_corr = data.frame(r = cor(df_merged_long$Strength_GPT, 
                             df_merged_long$Strength_human, 
                             method = "spearman"),
                     dimension = "Action Norms")


df_corr %>%
  ggplot(aes(x = dimension, y = r)) +
  geom_bar(stat = "identity", alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  ### FIX: removed stray empty argument (double comma) in geom_hline().
  geom_hline(yintercept = HUMAN_AGREEMENT_ACTION, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

Dataset 6: Glasgow Norms

### Load GPT-4's Glasgow Norms ratings (one row per word, one column
### per rated dimension).
df_gpt = read_csv("../../data/processed/glasgow/glasgow_gpt-4.csv")
## Rows: 871 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (9): AoA, Arousal, Concreteness, Dominance, Familiarity, Gender, Imageab...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 871
### Recode AoA: GPT gives age in years, but the Glasgow human norms use
### seven two-year bands (1 = "0-2", ..., 6 = "11-12", 7 = "13+"), so map
### years onto the same 1-7 scale to make the ratings comparable.
### FIX: the original top band used `AoA >= 13`, so fractional values in
### the open interval (12, 13) matched no branch and became NA (likely the
### row later removed by drop_na(), 871 -> 870 — TODO confirm); `AoA > 12`
### closes that gap. The temporary AoA2 helper column is also no longer
### left behind in df_gpt.
df_gpt = df_gpt %>%
  mutate(AoA = case_when(
    AoA <= 2 ~ 1,
    AoA <= 4 ~ 2,
    AoA <= 6 ~ 3,
    AoA <= 8 ~ 4,
    AoA <= 10 ~ 5,
    AoA <= 12 ~ 6,
    AoA > 12 ~ 7
  ))

### Load the human Glasgow Norms; each dimension has .M/.SD/.N columns.
df_human = read_csv("../../data/raw/glasgow/glasgow.csv")
## Rows: 871 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): word
## dbl (28): Length, Arousal.M, Arousal.SD, Arousal.N, Valence.M, Valence.SD, V...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_human)
## [1] 871
### Natural join on "word"; drop_na() removes one row containing a missing
### value (871 -> 870).
df_merged = df_human %>%
  inner_join(df_gpt) %>%
  drop_na()
## Joining, by = "word"
nrow(df_merged)
## [1] 870
### How correlated?

### Spearman correlation for each Glasgow dimension, pairing each human
### mean (".M") column with the matching GPT-4 column by name.
glasgow_dims = c("Valence", "Arousal", "Concreteness", "Familiarity",
                 "Imageability", "Dominance", "AoA", "Size", "Gender")

cors = vapply(glasgow_dims, function(d) {
  cor(df_merged[[paste0(d, ".M")]], df_merged[[d]], method = "spearman")
}, numeric(1))

### One-row tibble, one column per dimension (same shape as before).
df_summ = as_tibble(as.list(cors))

df_summ
## # A tibble: 1 × 9
##   Valence Arousal Concreteness Familiarity Imageabi…¹ Domin…²   AoA  Size Gender
##     <dbl>   <dbl>        <dbl>       <dbl>      <dbl>   <dbl> <dbl> <dbl>  <dbl>
## 1   0.756   0.661        0.814       0.707      0.743   0.385 0.717 0.686  0.471
## # … with abbreviated variable names ¹​Imageability, ²​Dominance
### Reshape to one row per dimension for plotting.
df_long = pivot_longer(df_summ, everything(),
                       names_to = "Factor", values_to = "Correlation")

### Horizontal bar chart of per-dimension correlations, ordered low to high.
ggplot(df_long, aes(y = reorder(Factor, Correlation), x = Correlation)) +
  geom_col() +
  labs(y = "Factor", x = "Correlation") +
  theme_minimal()

### calculate overall correlation
### Pivot human means to long format: one row per (word, dimension).
df_human_long = df_human %>%
  select(word, Valence.M, Arousal.M, Concreteness.M, Familiarity.M,
         Imageability.M, Dominance.M, AoA.M, Size.M, Gender.M) %>%
  pivot_longer(cols = c(Valence.M, Arousal.M, Concreteness.M, Familiarity.M,
                        Imageability.M, Dominance.M, AoA.M, Size.M, Gender.M),
               names_to = "Dimension",
               values_to = "Rating_human") %>%
  # Strip the ".M" suffix so Dimension matches the GPT column names.
  mutate(Dimension = str_remove(Dimension, "\\.M$"))

### Same long format for the GPT-4 ratings.
df_gpt_long = df_gpt %>%
  select(word, Valence, Arousal, Concreteness, Familiarity, Imageability,
         Dominance, AoA, Size, Gender) %>%
  pivot_longer(cols = c(Valence, Arousal, Concreteness, Familiarity,
                        Imageability, Dominance, AoA, Size, Gender),
               names_to = "Dimension",
               values_to = "Rating_GPT")

### FIX: dplyr joins take `by`, not `on`; the original `on = c(word)` was
### silently swallowed by `...` and the join fell back to the natural keys
### (word, Dimension). Spell those keys out explicitly. drop_na() removes
### rows missing a rating on either side.
### (Also removed an accidentally duplicated comment line.)
df_merged_long = df_human_long %>%
  inner_join(df_gpt_long, by = c("word", "Dimension")) %>%
  drop_na()
## Joining, by = c("word", "Dimension")
### Overall Pearson correlation across all word-dimension pairs.
cor.test(df_merged_long$Rating_GPT, df_merged_long$Rating_human)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged_long$Rating_GPT and df_merged_long$Rating_human
## t = 88.884, df = 7835, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6973734 0.7194273
## sample estimates:
##       cor 
## 0.7085733
### Spearman rank correlation; ties prevent an exact p-value (hence warning).
cor.test(df_merged_long$Rating_GPT, df_merged_long$Rating_human, method = "spearman")
## Warning in cor.test.default(df_merged_long$Rating_GPT,
## df_merged_long$Rating_human, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged_long$Rating_GPT and df_merged_long$Rating_human
## S = 2.5043e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.687827
### Scatter of GPT vs. human ratings with a linear fit, faceted by dimension.
ggplot(df_merged_long, aes(x = Rating_GPT, y = Rating_human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~Dimension) +
  labs(x = "Rating (GPT)",
       y = "Rating (Human)") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

### Overall Spearman correlation across all Glasgow dimensions, as one bar.
df_corr = data.frame(
  r = cor(df_merged_long$Rating_human, df_merged_long$Rating_GPT,
          method = "spearman"),
  dimension = "Glasgow Norms"
)

ggplot(df_corr, aes(x = dimension, y = r)) +
  geom_col(alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

Supplementary Analyses

GPT-3.5 Turbo

RAW-C

### Load GPT-3.5 Turbo's RAW-C relatedness ratings; alias the awkwardly
### backtick-named `gpt3.5-turbo_rating` column as gpt_turbo for later use.
df_gpt = read_csv("../../data/processed/raw-c/raw-c_gpt-3.5-turbo.csv") %>%
  mutate(gpt_turbo = `gpt3.5-turbo_rating`)
## Rows: 672 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sentence1, sentence2
## dbl (2): gpt-3.5-turbo_response, gpt3.5-turbo_rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(df_gpt)
## [1] 672
### Human RAW-C relatedness norms (one row per sentence pair).
df_human = read_csv("../../data/raw/raw-c/raw-c.csv")
## Rows: 672 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): word, sentence1, sentence2, ambiguity_type, disambiguating_word1, ...
## dbl  (8): mean_relatedness, median_relatedness, diff, count, sd_relatedness,...
## lgl  (1): same
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Natural join on the shared sentence columns; all 672 pairs match.
df_merged = df_human %>%
  inner_join(df_gpt)
## Joining, by = c("sentence1", "sentence2")
nrow(df_merged)
## [1] 672
### How correlated?
### Pearson correlation: human mean relatedness vs. GPT-3.5 Turbo rating.
cor.test(df_merged$mean_relatedness, df_merged$gpt_turbo)
## 
##  Pearson's product-moment correlation
## 
## data:  df_merged$mean_relatedness and df_merged$gpt_turbo
## t = 38.772, df = 670, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8068094 0.8536278
## sample estimates:
##       cor 
## 0.8316911
### Spearman rank correlation; ties prevent an exact p-value (hence warning).
cor.test(df_merged$mean_relatedness, df_merged$gpt_turbo, method = "spearman")
## Warning in cor.test.default(df_merged$mean_relatedness, df_merged$gpt_turbo, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_merged$mean_relatedness and df_merged$gpt_turbo
## S = 8468661, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##     rho 
## 0.83256
### Human inter-annotator agreement benchmark for RAW-C (presumably from
### the original norming study — TODO confirm provenance).
HUMAN_AGREEMENT_RAWC = 0.79

### Overall Spearman correlation, as a single bar against the human benchmark.
df_corr = df_merged %>%
  summarise(r = cor(mean_relatedness, gpt_turbo, method = "spearman")) %>%
  mutate(Dimension = "Relatedness (RAW-C)")

df_corr %>%
  ggplot(aes(x = Dimension, y = r)) +
  geom_bar(stat = "identity", alpha = .7) +
  scale_y_continuous(limits = c(0, 1)) +
  ### FIX: removed stray empty argument (double comma) in geom_hline().
  geom_hline(yintercept = HUMAN_AGREEMENT_RAWC, linetype = "dashed",
             color = "steelblue3", size = 1) +
  labs(x = "", y = "Correlation") +
  theme_minimal()

### Scatter of GPT-3.5 Turbo vs. human relatedness with a linear fit.
ggplot(df_merged, aes(x = gpt_turbo, y = mean_relatedness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "GPT-3.5 Turbo Relatedness Judgment",
       y = "Human Relatedness Judgment") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

Error Analysis

### Signed and absolute error of GPT-3.5 Turbo relative to the human mean.
df_merged = df_merged %>%
  mutate(diff = gpt_turbo - mean_relatedness) %>%
  mutate(abs_diff = abs(diff))

### Distribution of absolute errors.
ggplot(df_merged, aes(x = abs_diff)) +
  geom_histogram(alpha = .5, bins = 7)

### Absolute error split by whether the two sentences use the same sense.
### Point-range shows the mean with +/- 2 standard errors.
df_merged %>%
  ggplot(aes(y = reorder(same, abs_diff), x = abs_diff)) +
  geom_jitter(alpha = .1, width = .1) +
  stat_summary(fun = mean,
               fun.min = function(v) mean(v) - 2 * sd(v) / sqrt(length(v)),
               fun.max = function(v) mean(v) + 2 * sd(v) / sqrt(length(v)),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  theme_minimal() +
  labs(y = "Same Sense",
       x = "Absolute Error")

### Density of absolute error by ambiguity type, split by same/different sense.
df_merged %>%
  ggplot(aes(x = abs_diff,
             y = reorder(ambiguity_type, abs_diff),
             fill = same)) +
  geom_density_ridges2(aes(height = ..density..), 
                       color=gray(0.25), 
                       alpha = 0.5, 
                       scale=0.85, 
                       size=.9, 
                       stat="density") +
  labs(x = "Absolute error",
       y = "Ambiguity Type",
       fill = "Same vs. Different Sense") +
  ### FIX: theme_minimal() is a complete theme and replaces every element
  ### set by earlier theme() calls, so the original's legend position and
  ### enlarged text sizes were silently discarded. Apply the complete theme
  ### first, then the customizations on top.
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.title = element_text(size = rel(1.5)),
        axis.text = element_text(size = rel(1.5)),
        legend.text = element_text(size = rel(1.5)),
        legend.title = element_text(size = rel(1.5)),
        strip.text.x = element_text(size = rel(1.5)))

### Is absolute error larger for different-sense pairs? (Yes: the sameTRUE
### coefficient below is negative, i.e. same-sense errors are smaller.)
summary(lm(data = df_merged, abs_diff ~ same))
## 
## Call:
## lm(formula = abs_diff ~ same, data = df_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9690 -0.4333 -0.1336  0.3315  2.9310 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.96904    0.02986  32.449   <2e-16 ***
## sameTRUE    -0.48398    0.05172  -9.357   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6321 on 670 degrees of freedom
## Multiple R-squared:  0.1156, Adjusted R-squared:  0.1143 
## F-statistic: 87.55 on 1 and 670 DF,  p-value: < 2.2e-16
### Model human relatedness from the GPT rating plus the same-sense flag;
### both predictors contribute (see coefficients below).
mod = lm(data = df_merged, mean_relatedness ~ gpt_turbo + same)
summary(mod)
## 
## Call:
## lm(formula = mean_relatedness ~ gpt_turbo + same, data = df_merged)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54203 -0.40698  0.00481  0.40139  2.29302 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.22807    0.07354  -3.101  0.00201 ** 
## gpt_turbo    0.71752    0.03029  23.690  < 2e-16 ***
## sameTRUE     0.88991    0.07867  11.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7115 on 669 degrees of freedom
## Multiple R-squared:  0.7412, Adjusted R-squared:  0.7404 
## F-statistic: 958.1 on 2 and 669 DF,  p-value: < 2.2e-16
### Does GPT-turbo predict human relatedness for same sense judgments?
### (Yes, though more weakly: R-squared ~0.15 on the same-sense subset.)
mod = lm(data = filter(df_merged, same == TRUE), mean_relatedness ~ gpt_turbo)
summary(mod)
## 
## Call:
## lm(formula = mean_relatedness ~ gpt_turbo, data = filter(df_merged, 
##     same == TRUE))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1506 -0.2285  0.1221  0.3746  0.7069 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.39894    0.32835   4.260 3.01e-05 ***
## gpt_turbo    0.52883    0.08363   6.323 1.39e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 222 degrees of freedom
## Multiple R-squared:  0.1526, Adjusted R-squared:  0.1488 
## F-statistic: 39.99 on 1 and 222 DF,  p-value: 1.389e-09